#!/usr/bin/python27
#coding=utf8
import urllib2,sys,datetime
from bs4 import BeautifulSoup
import re
from StringIO import StringIO
import gzip
import HTMLParser
from cvm.common.DataBase import DataBase

# Python 2 hack: site.py deletes sys.setdefaultencoding at startup;
# reload(sys) restores it so we can make implicit str<->unicode
# conversions default to UTF-8 (needed for the Chinese text below).
reload(sys)

sys.setdefaultencoding('utf8')

"""
从 http://news.baidu.com/?tn=news  搜索 ‘汽车 政策’
注意：headres 一定要 格式 正确   直接写到数据库
Author:Yyb
Date:20170228
Email:yangyingbo@unimlink
"""


def main():
    """Scrape Baidu News results for '汽车 政策' (car policy).

    Fetches the first page, then pages 2..10 (20 hits per page) by
    adding a 'pn' offset, delegating download + persistence to getHtml().
    """
    headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding':'gzip,deflate',
    'Accept-Language':'zh-CN,zh;q=0.8',
    'Cache-Control':'max-age = 0',
    'Connection':'keep-alive',
    'Cookie': 'BDUSS=EhCWlJhMmN5aEttUmNqREwyQWJtb0pyUUpYaVFvU1pnU3c4TkExRmtzTFNXYjlZSVFBQUFBJCQAAAAAAAAAAAEAAAAtohgLMTA2NTY0MjQ4NgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANLMl1jSzJdYd;BAIDUID=51E39C665D1E257F112008D21BFF81FC:FG=1;PSTM=1487559921;BIDUPSID=434FDAF8311C127BED258CB8F62782EA;MCITY=-%3A;LOCALGX=%u4E0A%u6D77%7C%32%33%35%34%7C%u4E0A%u6D77%7C%32%33%35%34;BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0;BDRCVFR[C0p6oIjvx-c]=mk3SLVN4HKm;BD_CK_SAM=1;PSINO=2;BDSVRTM=741;H_PS_PSSID=',
    'Host':'news.baidu.com',
    # BUG FIX: header name was 'Upgrade - Insecure - Requests', which is
    # not a valid HTTP header field name.
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/55.0.2883.87Safari/537.36'
    }
    search_key = "汽车 政策"
    urled_search_key = urllib2.quote(search_key)
    # First page: no 'pn' offset parameter.
    url = 'http://news.baidu.com/ns?word=' + urled_search_key + '&tn=news&from=news&cl=2&rn=20&ct=1'
    getHtml(url, headers)
    # Pages 2..10: Baidu offsets results with pn = page_index * 20.
    max_pages = 10
    for page_index in range(1, max_pages):
        url = 'http://news.baidu.com/ns?word=' + urled_search_key + '&pn=%d&cl=2&ct=1&tn=news&rn=20&ie=utf-8&bt=0&et=0&rsv_page=1' % (page_index * 20)
        getHtml(url, headers)

def getHtml(url, hearders):
    """Download *url* with the given request headers, decode the body to
    unicode, unescape HTML entities and hand the markup to parse_html()
    together with a fresh DataBase session.
    """
    request = urllib2.Request(url, headers=hearders)
    res = urllib2.urlopen(request)
    data = res.read()
    # BUG FIX: only gunzip when the server actually compressed the body.
    # Blindly wrapping the payload in GzipFile raises IOError whenever the
    # server ignores Accept-Encoding and answers with an identity body.
    if res.info().get('Content-Encoding', '') == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO(data), mode="rb").read()
    page = data.decode('utf-8')
    html_parser = HTMLParser.HTMLParser()
    html = html_parser.unescape(page)
    dataBase = DataBase()
    parse_html(html, dataBase)


def parse_html(html, dataBase):  ###解析网页
    # print page
    """获取今天的最大id"""
    # dataBase.session.set_character_set('utf8')
    sql='SET NAMES utf8;'
    dataBase.rumSql(sql)
    sql = 'SET CHARACTER SET utf8;'
    dataBase.rumSql(sql)
    sql = 'SET character_set_connection=utf8;'
    dataBase.rumSql(sql)
    sql = "select max(order_flag) from cvm_car_news where news_date='%s'".decode('utf8') % (datetime.datetime.now().strftime('%Y年%m月%d日'),)
    max_over_flag = dataBase.session.execute(sql).fetchone()
    if max_over_flag == (None,):
        count = 0
    else:
        print max_over_flag
        count = int(list(max_over_flag)[0])
    news_list = []
    soup = BeautifulSoup(html, 'html.parser')
    news = soup.find_all('div', 'result')
    for news_i in news:
        soup_1 = BeautifulSoup(str(news_i), 'html.parser')
        # print soup_1
        title_tmp = str(soup_1.div.a).replace('\n', '')
        # print title_tmp
        url_link_tmp = re.sub('^<a.*href="', '', title_tmp)
        url_link = re.sub('".*</a>', '', url_link_tmp).decode('utf-8')
        # print url_link
        title_tmp = re.sub('^<a.*target="_blank">', '', title_tmp)
        title = title_tmp.replace('</a>', '').replace('<em>', '').replace('</em>', '').decode('utf-8')
        # print title
        try:
            img_url = str(soup_1.div.img['src'])
        except Exception, e:
            img_url = ''
        img_url = img_url.decode('utf-8')
        # print img_url
        src_mews_tmp = str(soup_1.div.p)
        src_mews_tmp = src_mews_tmp.replace('<p class="c-author">', '').replace('</p>', '')
        src_mews = src_mews_tmp.split(' ')[0].decode('utf-8')
        new_time = src_mews_tmp.split(' ')[-1].decode('utf-8')
        if '前' in new_time:
            count += 1
            new_time = datetime.datetime.now().strftime('%Y年%m月%d日 %H:%M')
        else:
            break
        # print src_mews
        # print new_time
        context_tmp = soup_1.find_all('div', 'c-span18 c-span-last')
        for context_tmp_i in context_tmp:
            ccc = re.sub('^<div.*</p>', '', str(context_tmp_i))
            ccc1 = re.sub('<span.*</div>', '', ccc).replace('<em>', '').replace('</em>', '').decode('utf-8')
            break
        # print ccc1
        news_list.append((title,src_mews,url_link,img_url,ccc1,count,datetime.datetime.now().strftime('%Y年%m月%d日'),datetime.datetime.now().strftime('%H:%M')))
    update_or_insert_mysql(news_list, dataBase)

def update_or_insert_mysql(news_list,dataBase):
    for news_list_i in news_list:
        title = news_list_i[0]
        date_string = news_list_i[6]
        sql = "select title from cvm_car_news where title='%s' and news_date='%s'" % (title,date_string)
        ssss = dataBase.session.execute(sql).fetchone()
        print ssss
        if ssss is None:  ##now insert
            sql_1 = '''
            insert into cvm_car_news (title,src_news,url_link,img_link,context,order_flag,news_date,news_time,create_time)
            values('%s','%s','%s','%s','%s',%d,'%s','%s',now())
            ''' % (news_list_i[0],news_list_i[1],news_list_i[2],news_list_i[3],news_list_i[4],news_list_i[5],news_list_i[6],news_list_i[7])
            dataBase.rumSql(sql_1)
            dataBase.commit()
        else:             ## now update
            pass


# Script entry point: run the scrape only when executed directly.
if __name__ == '__main__':
    main()